l1ValidationwithMultiplexPerturbations
In [ ]:
!date
Wed May 26 15:06:14 UTC 2021
Download Data¶
In [ ]:
import requests
from tqdm import tnrange, tqdm_notebook
def download_file(doi,ext):
url = 'https://api.datacite.org/dois/'+doi+'/media'
r = requests.get(url).json()
netcdf_url = r['data'][0]['attributes']['url']
r = requests.get(netcdf_url,stream=True)
#Set file name
fname = doi.split('/')[-1]+ext
#Download file with progress bar
if r.status_code == 403:
print("File Unavailable")
if 'content-length' not in r.headers:
print("Did not get file")
else:
with open(fname, 'wb') as f:
total_length = int(r.headers.get('content-length'))
pbar = tnrange(int(total_length/1024), unit="B")
for chunk in r.iter_content(chunk_size=1024):
if chunk:
pbar.update()
f.write(chunk)
return fname
In [ ]:
#De-multiplexed h5ad with gene counts for all cells, labeled in each condition (from Gehring et al 2019)
download_file('10.22002/D1.1997','.gz')
In [ ]:
!gunzip *.gz
In [ ]:
#!wget --quiet https://caltech.box.com/shared/static/s33gu9namm9b1f5ts3gs4eqxauwkfbqh
!mv D1.1997 multiplex.h5ad
Import Packages¶
In [ ]:
!pip install --quiet anndata
!pip install --quiet scanpy==1.6.0
|████████████████████████████████| 133kB 7.9MB/s
|████████████████████████████████| 7.7MB 7.2MB/s
|████████████████████████████████| 81kB 10.5MB/s
|████████████████████████████████| 1.2MB 45.2MB/s
|████████████████████████████████| 71kB 8.6MB/s
Building wheel for umap-learn (setup.py) ... done
Building wheel for sinfo (setup.py) ... done
Building wheel for pynndescent (setup.py) ... done
In [ ]:
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import anndata
%matplotlib inline
sc.set_figure_params(dpi=125)
import seaborn as sns
sns.set(style="whitegrid")
from sklearn.metrics import pairwise_distances
Import Data¶
In [ ]:
#From Gehring et al 2019
adata = anndata.read_h5ad('multiplex.h5ad')
adata
/usr/local/lib/python3.7/dist-packages/anndata/compat/__init__.py:183: FutureWarning: Moving element from .uns['neighbors']['distances'] to .obsp['distances']. This is where adjacency matrices should go now. FutureWarning, /usr/local/lib/python3.7/dist-packages/anndata/compat/__init__.py:183: FutureWarning: Moving element from .uns['neighbors']['connectivities'] to .obsp['connectivities']. This is where adjacency matrices should go now. FutureWarning,
Out[ ]:
AnnData object with n_obs × n_vars = 21232 × 1221
obs: 'n_genes', 'n_counts', 'LouvainRes1.5', 'LouvainRes2', 'TSNE_1', 'TSNE_2', 'assignments'
var: 'n_cells'
uns: 'LouvainRes1.5_colors', 'LouvainRes2_colors', 'louvain', 'neighbors', 'pca'
obsm: 'X_pca', 'X_tsne', 'X_umap'
varm: 'PCs'
obsp: 'distances', 'connectivities'
Check Well Plate Assignments¶
In [ ]:
sc.pl.umap(adata, color='assignments', s=20)
In [ ]:
"Concentration of BMB-4 from highest (3) to lowest (0)"
BMPkey= ['3','3','3','2','2','2','1','1','1','0','0','0']
"Concentration of EGF/bFGF from highest (3) to lowest (0)"
EGFkey= ['3','3','2','2','1','1','0','0']
"Concentration of Scriptaid/Decitabine from highest (2) to lowest (0)"
ScripDeckey= ['2','1','0','2','1','0','2','1','0','2','1','0']
"1 if the sample is RA, 0 if Scriptaid/Decitabine"
RAkey= ['0','1','0','1','0','1','0','1']
In [ ]:
#Set values for BMP4
wells = list(adata.obs['assignments'])
bmp = [int(i)%12 for i in wells]
bmp = [BMPkey[i] for i in bmp]
bmp[0]
Out[ ]:
'0'
In [ ]:
#Set values for Scrip/RA Concentration
sd = [int(i)%12 for i in wells]
sd = [ScripDeckey[i] for i in sd]
sd[0]
Out[ ]:
'2'
In [ ]:
#Well rows
def retRow(val):
val = int(val)
if val < 12:
return 0;
elif val < 24:
return 1;
elif val < 36:
return 2;
elif val < 48:
return 3;
elif val < 60:
return 4;
elif val < 72:
return 5;
elif val < 84:
return 6;
elif val < 96:
return 7;
wellRows = [retRow(i) for i in wells]
print(wells[0])
print(wellRows[0])
print(wells[1])
print(wellRows[1])
33 2 43 3
In [ ]:
#Set values for EGF
print(wells[0])
egf = [EGFkey[i] for i in wellRows]
egf[0]
33
Out[ ]:
'2'
In [ ]:
#Set values for EGF
print(wells[0])
ra = [RAkey[i] for i in wellRows]
ra[0]
33
Out[ ]:
'0'
In [ ]:
plt.figure(figsize=(6,4))
sns.scatterplot(x =adata.obsm['X_umap'][:,0], y =adata.obsm['X_umap'][:,1], s=8, hue=bmp)
plt.title('BMP-4 Concentration')
plt.axis('off')
plt.grid(False)
In [ ]:
plt.figure(figsize=(6,4))
sns.scatterplot(x =adata.obsm['X_umap'][:,0], y =adata.obsm['X_umap'][:,1], s=8, hue=egf)
plt.title('BMP-4 Concentration')
plt.axis('off')
plt.grid(False)
In [ ]:
plt.figure(figsize=(6,4))
sns.scatterplot(x =adata.obsm['X_umap'][:,0], y =adata.obsm['X_umap'][:,1], s=8, hue=sd)
plt.title('BMP-4 Concentration')
plt.axis('off')
plt.grid(False)
Do L1 Distance Analysis in PCA Space for Perturbed Cells¶
In [ ]:
def doL1(cent1,cent2):
return np.linalg.norm(cent1-cent2,1)
In [ ]:
def doPairwise(cent1,cent2):
d = pairwise_distances(cent1,cent2, metric = 'l1')
return d.reshape((d.shape[0]*d.shape[1],))
In [ ]:
#Get X_PCA centroid for wells with lowest in all treatments (no perturbation)
control_PCA = np.mean(adata[adata.obs['assignments'].isin(['11','23'])].obsm['X_pca'],axis=0)
control_PCA.shape
Out[ ]:
(50,)
In [ ]:
#Get X_PCA centroid for wells with highest in BMP4, EGF, SD, RA but no other treatment
highBMP_PCA = np.mean(adata[adata.obs['assignments'].isin(['2','14'])].obsm['X_pca'],axis=0)
#highEGF_PCA = np.mean(adata[adata.obs['assignments'].isin(['11','23'])].obsm['X_pca'],axis=0)
#highSD_PCA = np.mean(adata[adata.obs['assignments'].isin(['81'])].obsm['X_pca'],axis=0) too few cells
In [ ]:
#Get X_PCA centroid for wells with second lowest (beside no addition) in BMP4, EGF, SD, RA but no other treatment
lowBMP_PCA = np.mean(adata[adata.obs['assignments'].isin(['8','20'])].obsm['X_pca'],axis=0)
lowEGF_PCA = np.mean(adata[adata.obs['assignments'].isin(['59','71'])].obsm['X_pca'],axis=0)
#lowSD_PCA = np.mean(adata[adata.obs['assignments'].isin(['82'])].obsm['X_pca'],axis=0) too few cells
In [ ]:
#Get X_PCA centroid for wells with second highest in BMP4, EGF, SD, RA but no other treatment
midBMP_PCA = np.mean(adata[adata.obs['assignments'].isin(['5','17'])].obsm['X_pca'],axis=0)
midEGF_PCA = np.mean(adata[adata.obs['assignments'].isin(['35','47'])].obsm['X_pca'],axis=0)
#lowSD_PCA = np.mean(adata[adata.obs['assignments'].isin(['82'])].obsm['X_pca'],axis=0) too few cells
In [ ]:
labs = ['No BMP4-High BMP4',
'Low-High BMP4', 'High EGF (Control)-Mid EGF',
'Mid-High BMP4', 'High EGF (Control)-Low EGF']
dists = [doL1(control_PCA,highBMP_PCA),
doL1(lowBMP_PCA,highBMP_PCA),doL1(control_PCA,midEGF_PCA),
doL1(midBMP_PCA,highBMP_PCA),doL1(control_PCA,lowEGF_PCA)]
l1dists = pd.DataFrame()
l1dists['Condition'] = labs
l1dists['Perturbation'] = ['BMP4','BMP4','EGF','BMP4','EGF']
l1dists['Distance'] = dists
In [ ]:
l1dists
Out[ ]:
| Condition | Perturbation | Distance | |
|---|---|---|---|
| 0 | No BMP4-High BMP4 | BMP4 | 38.898682 |
| 1 | Low-High BMP4 | BMP4 | 32.818871 |
| 2 | High EGF (Control)-Mid EGF | EGF | 21.955193 |
| 3 | Mid-High BMP4 | BMP4 | 15.085073 |
| 4 | High EGF (Control)-Low EGF | EGF | 37.925369 |
In [ ]:
plt. grid(False)
sns.scatterplot(data=l1dists,x='Perturbation',y='Distance',hue='Condition',palette=['skyblue','royalblue','darkgreen','blue','lightgreen'],alpha=0.6)
# Put the legend out of the figure
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.yticks(fontsize=10 )
plt.xticks(fontsize=10 )
plt.xlabel('Perturbant',fontsize=10)
plt.ylabel('L1 Distance',fontsize=10)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.,fontsize=10)
plt.show()
Show pairwise distance distributions between all cells
In [ ]:
def doL1(cent1,cent2):
return np.linalg.norm(cent1-cent2,1)
In [ ]:
#Get X_PCA centroid for wells with lowest in all treatments (no perturbation)
control_PCA = adata[adata.obs['assignments'].isin(['11','23'])].obsm['X_pca']
control_PCA.shape
Out[ ]:
(442, 50)
In [ ]:
#Get X_PCA centroid for wells with highest in BMP4, EGF, SD, RA but no other treatment
highBMP_PCA = adata[adata.obs['assignments'].isin(['2','14'])].obsm['X_pca']
#highEGF_PCA = adata[adata.obs['assignments'].isin(['11','23'])].obsm['X_pca']
#highSD_PCA = np.mean(adata[adata.obs['assignments'].isin(['81'])].obsm['X_pca'],axis=0) too few cells
In [ ]:
#Get X_PCA centroid for wells with second lowest (beside no addition) in BMP4, EGF, SD, RA but no other treatment
lowBMP_PCA = adata[adata.obs['assignments'].isin(['8','20'])].obsm['X_pca']
lowEGF_PCA = adata[adata.obs['assignments'].isin(['59','71'])].obsm['X_pca']
#lowSD_PCA = np.mean(adata[adata.obs['assignments'].isin(['82'])].obsm['X_pca'],axis=0) too few cells
In [ ]:
#Get X_PCA centroid for wells with second highest in BMP4, EGF, SD, RA but no other treatment
midBMP_PCA = adata[adata.obs['assignments'].isin(['5','17'])].obsm['X_pca']
midEGF_PCA = adata[adata.obs['assignments'].isin(['35','47'])].obsm['X_pca']
#lowSD_PCA = np.mean(adata[adata.obs['assignments'].isin(['82'])].obsm['X_pca'],axis=0) too few cells
In [ ]:
labs = ['No BMP4-High BMP4',
'Low-High BMP4', 'High EGF (Control)-Mid EGF',
'Mid-High BMP4', 'High EGF (Control)-Low EGF','No BMP4-No BMP4']
dists = [doPairwise(control_PCA,highBMP_PCA),
doPairwise(lowBMP_PCA,highBMP_PCA),doPairwise(control_PCA,midEGF_PCA),
doPairwise(midBMP_PCA,highBMP_PCA),doPairwise(control_PCA,lowEGF_PCA),doPairwise(control_PCA,control_PCA)]
l1dists = pd.DataFrame()
l1dists['Condition'] = labs
l1dists['Perturbation'] = ['BMP4','BMP4','EGF','BMP4','EGF','BMP4']
l1dists['Distance'] = dists
In [ ]:
In [ ]:
l1dists.head()
Out[ ]:
| Condition | Perturbation | Distance | |
|---|---|---|---|
| 0 | No BMP4-High BMP4 | BMP4 | [79.13685408676974, 78.16393609973602, 100.440... |
| 1 | Low-High BMP4 | BMP4 | [88.14768282882869, 89.5432673767209, 110.2817... |
| 2 | High EGF (Control)-Mid EGF | EGF | [82.11652587610297, 110.98801950621419, 75.765... |
| 3 | Mid-High BMP4 | BMP4 | [56.934534843079746, 59.8543961988762, 76.6739... |
| 4 | High EGF (Control)-Low EGF | EGF | [66.36589867505245, 97.6100571389834, 115.5246... |
In [ ]:
plt. grid(False)
plt.hist(l1dists[l1dists.Condition == 'No BMP4-High BMP4'].Distance,color='skyblue',alpha=0.6,label ='No BMP4-High BMP4' ,density=True,bins=25)
plt.hist(l1dists[l1dists.Condition == 'Low-High BMP4'].Distance,color='royalblue',alpha=0.4,label = 'Low-High BMP4',density=True,bins=25)
plt.hist(l1dists[l1dists.Condition == 'Mid-High BMP4'].Distance,color='blue',alpha=0.4,label = 'Mid-High BMP4',density=True,bins=25)
plt.hist(l1dists[l1dists.Condition == 'No BMP4-No BMP4'].Distance,color='grey',alpha=0.4,label = 'No BMP4-No BMP4',density=True,bins=25)
plt.yticks(fontsize=10 )
plt.xticks(fontsize=10 )
plt.xlabel('Pairwise L1 Distance',fontsize=10)
plt.ylabel('Frequency',fontsize=10)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.,fontsize=10)
plt.show()
#sns.histplot(data=l1dists, x="sepal_width", color="red", label="Sepal Width")
In [ ]:
plt. grid(False)
#plt.hist(l1dists[l1dists.Condition == 'Control-High EGF'].Distance,color='lightgreen',bins=15,alpha=0.6,label ='Control-High EGF' ,density=True)
plt.hist(l1dists[l1dists.Condition == 'High EGF (Control)-Low EGF'].Distance,color='lightgreen',bins=25,alpha=0.6,label = 'High EGF (Control)-Low EGF',density=True)
plt.hist(l1dists[l1dists.Condition == 'High EGF (Control)-Mid EGF'].Distance,color='darkgreen',bins=25,alpha=0.6,label = 'High EGF (Control)-Mid EGF',density=True)
plt.xlabel('Pairwise L1 Distance')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
In [ ]: